import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import geopandas as gpd
import folium
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from datetime import datetime
from sklearn.metrics import mean_squared_error, r2_score
df = pd.read_excel('turbo_az_dataset.xlsx',names = ['City', 'Price', 'Make', 'Model', 'Year', 'Body_Type', 'Color', 'Engine', 'Kilometrage', 'Transmission', 'Drivetrain', 'Is_New', 'Number_of_Seats', 'Condition', 'Market', 'Damaged', 'Owners'])
df.head()
| City | Price | Make | Model | Year | Body_Type | Color | Engine | Kilometrage | Transmission | Drivetrain | Is_New | Number_of_Seats | Condition | Market | Damaged | Owners | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bərdə | 8 500 AZN | GAZ | 53 | 1992 | Yük maşını | Yaş Asfalt | 4.3 L / 150 a.g. / Benzin | 500 000 km | Mexaniki (MT) | Arxa | Xeyr | NaN | Vuruğu yoxdur, rənglənməyib | NaN | NaN | NaN |
| 1 | Tovuz | 8 500 AZN | Mercedes | C 180 | 1994 | Sedan | Gümüşü | 1.8 L / 122 a.g. / Benzin | 596 890 km | Avtomat (AT) | Arxa | Xeyr | 5 | Vuruğu yoxdur, rənglənib | Amerika | NaN | NaN |
| 2 | Ağdaş | 2 500 AZN | LADA (VAZ) | 2108 | 1989 | Hetçbek, 3 qapı | Sarı | 1.5 L / 78 a.g. / Benzin | 172 820 km | Mexaniki (MT) | Ön | Xeyr | 5 | Vuruğu var, rənglənib | NaN | NaN | NaN |
| 3 | Sumqayıt | 28 900 AZN | BMW | 528 | 2012 | Sedan | Ağ | 2.0 L / 245 a.g. / Benzin | 152 410 km | Avtomat (AT) | Arxa | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Amerika | NaN | NaN |
| 4 | Bakı | 21 500 AZN | Hyundai | ix35 | 2012 | Offroader / SUV, 5 qapı | Ağ | 2.0 L / 150 a.g. / Benzin | 270 000 km | Avtomat (AT) | Ön | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Rəsmi diler | NaN | NaN |
df.shape
(20078, 17)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20078 entries, 0 to 20077 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 20078 non-null object 1 Price 20078 non-null object 2 Make 20078 non-null object 3 Model 20078 non-null object 4 Year 20078 non-null int64 5 Body_Type 20078 non-null object 6 Color 20078 non-null object 7 Engine 20078 non-null object 8 Kilometrage 20078 non-null object 9 Transmission 20078 non-null object 10 Drivetrain 20078 non-null object 11 Is_New 20078 non-null object 12 Number_of_Seats 17893 non-null object 13 Condition 20077 non-null object 14 Market 16334 non-null object 15 Damaged 103 non-null object 16 Owners 8014 non-null object dtypes: int64(1), object(16) memory usage: 2.6+ MB
3.1 Fixing Inconsistencies¶
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20078 entries, 0 to 20077 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 20078 non-null object 1 Price 20078 non-null object 2 Make 20078 non-null object 3 Model 20078 non-null object 4 Year 20078 non-null int64 5 Body_Type 20078 non-null object 6 Color 20078 non-null object 7 Engine 20078 non-null object 8 Kilometrage 20078 non-null object 9 Transmission 20078 non-null object 10 Drivetrain 20078 non-null object 11 Is_New 20078 non-null object 12 Number_of_Seats 17893 non-null object 13 Condition 20077 non-null object 14 Market 16334 non-null object 15 Damaged 103 non-null object 16 Owners 8014 non-null object dtypes: int64(1), object(16) memory usage: 2.6+ MB
Let's look for inconsistencies in dataset. We can clearly see that when some columns should be numeric but they are in object data type. Such as Price , Engine, Kilometrage, Number_of_Seats. Let's transform them to numeric columns first.
Pricecolumn
In price column some values are in different currencies. We need to bring them to a single currency and change the column to a numeric dtype.
df['Price']
0 8 500 AZN
1 8 500 AZN
2 2 500 AZN
3 28 900 AZN
4 21 500 AZN
...
20073 14 300 AZN
20074 19 800 AZN
20075 34 000 AZN
20076 ≈ 86 360 AZN50 800 USD
20077 18 500 AZN
Name: Price, Length: 20078, dtype: object
# keep prices only in AZN currency
df['Price'] = df['Price'].str.split('AZN').str[0]
# removing any extra spaces and characters just to keep numbers
df['Price'] = df['Price'].str.replace(r'[^0-9]','', regex = True)
df['Price']= df['Price'].astype(int)
df['Price']
0 8500
1 8500
2 2500
3 28900
4 21500
...
20073 14300
20074 19800
20075 34000
20076 86360
20077 18500
Name: Price, Length: 20078, dtype: int32
Enginecolumn
df['Engine']
0 4.3 L / 150 a.g. / Benzin
1 1.8 L / 122 a.g. / Benzin
2 1.5 L / 78 a.g. / Benzin
3 2.0 L / 245 a.g. / Benzin
4 2.0 L / 150 a.g. / Benzin
...
20073 1.8 L / 204 a.g. / Benzin
20074 1.5 L / 184 a.g. / Benzin
20075 1.6 L / 183 a.g. / Benzin
20076 1.6 L / 230 a.g. / Hibrid
20077 1.5 L / 160 a.g. / Benzin
Name: Engine, Length: 20078, dtype: object
We see that Engine column combines data for 3 different values (engine, engine power,fuel type). We should separate it into 3 distinct columns.
df[['Engine(L)','Engine_Power','Fuel_Type']] = df['Engine'].str.split('/',expand = True)
df.loc[df['Make'] == 'Tesla',['Engine(L)','Engine_Power','Fuel_Type']]
| Engine(L) | Engine_Power | Fuel_Type | |
|---|---|---|---|
| 331 | 283 a.g. | Elektro | None |
| 341 | 770 a.g. | Elektro | None |
| 406 | 345 a.g. | Elektro | None |
| 699 | 1020 a.g. | Elektro | None |
| 711 | 450 a.g. | Elektro | None |
| 759 | 554 a.g. | Elektro | None |
| 817 | 462 a.g. | Elektro | None |
| 844 | 450 a.g. | Elektro | None |
| 1893 | 283 a.g. | Elektro | None |
| 2111 | 562 a.g. | Elektro | None |
| 2134 | 450 a.g. | Elektro | None |
| 2478 | 562 a.g. | Elektro | None |
| 2861 | 258 a.g. | Elektro | None |
| 2920 | 462 a.g. | Elektro | None |
| 3051 | 351 a.g. | Elektro | None |
| 3065 | 428 a.g. | Elektro | None |
| 3168 | 351 a.g. | Elektro | None |
| 3229 | 321 a.g. | Elektro | None |
| 4094 | 534 a.g. | Elektro | None |
| 4122 | 462 a.g. | Elektro | None |
| 5014 | 450 a.g. | Elektro | None |
| 5775 | 351 a.g. | Elektro | None |
| 6198 | 510 a.g. | Elektro | None |
| 6756 | 351 a.g. | Elektro | None |
| 7250 | 500 a.g. | Elektro | None |
| 7885 | 462 a.g. | Elektro | None |
| 8631 | 845 a.g. | Elektro | None |
| 9190 | 351 a.g. | Elektro | None |
| 9834 | 450 a.g. | Elektro | None |
| 9905 | 450 a.g. | Elektro | None |
| 10055 | 258 a.g. | Elektro | None |
| 10262 | 450 a.g. | Elektro | None |
| 10335 | 462 a.g. | Elektro | None |
| 10556 | 450 a.g. | Elektro | None |
| 10697 | 283 a.g. | Elektro | None |
| 11587 | 450 a.g. | Elektro | None |
| 11597 | 562 a.g. | Elektro | None |
| 11707 | 670 a.g. | Elektro | None |
| 12636 | 462 a.g. | Elektro | None |
| 12965 | 560 a.g. | Elektro | None |
| 14044 | 260 a.g. | Elektro | None |
| 14684 | 690 a.g. | Elektro | None |
| 14745 | 283 a.g. | Elektro | None |
| 14759 | 845 a.g. | Elektro | None |
| 15708 | 462 a.g. | Elektro | None |
| 15718 | 351 a.g. | Elektro | None |
| 15822 | 562 a.g. | Elektro | None |
| 15891 | 462 a.g. | Elektro | None |
| 16550 | 470 a.g. | Elektro | None |
| 16601 | 462 a.g. | Elektro | None |
| 16979 | 450 a.g. | Elektro | None |
| 17321 | 462 a.g. | Elektro | None |
| 17460 | 462 a.g. | Elektro | None |
| 17580 | 283 a.g. | Elektro | None |
| 17846 | 283 a.g. | Elektro | None |
| 17890 | 304 a.g. | Elektro | None |
| 19411 | 1020 a.g. | Elektro | None |
| 19494 | 540 a.g. | Elektro | None |
| 19511 | 450 a.g. | Elektro | None |
| 19643 | 462 a.g. | Elektro | None |
For the electrical cars values of the three columns above are mixed. This problem needs to be fixed.
df.loc[df['Engine'].str.contains('Elektro'),'Fuel_Type'] = 'Electric'
df.loc[df['Engine'].str.contains('Elektro'),'Engine_Power'] = df['Engine(L)']
df.loc[df['Engine'].str.contains('Elektro'),'Engine(L)'] = '0 L'
Engine Powercolumn
df['Engine(L)'] = df['Engine(L)'].str.replace(r'[a-zA-Z\s]','',regex = True).astype(float)
df.loc[df['Engine'].str.contains('sm3'),'Engine(L)'] =df.loc[df['Engine'].str.contains('sm3'),'Engine(L)'] /1000
Converted sm3 to liters
Kilometrage,Engine_Power(a.g)andNumber_of_Seatscolumns
df
| City | Price | Make | Model | Year | Body_Type | Color | Engine | Kilometrage | Transmission | Drivetrain | Is_New | Number_of_Seats | Condition | Market | Damaged | Owners | Engine(L) | Engine_Power | Fuel_Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bərdə | 8500 | GAZ | 53 | 1992 | Yük maşını | Yaş Asfalt | 4.3 L / 150 a.g. / Benzin | 500 000 km | Mexaniki (MT) | Arxa | Xeyr | NaN | Vuruğu yoxdur, rənglənməyib | NaN | NaN | NaN | 4.3 | 150 a.g. | Benzin |
| 1 | Tovuz | 8500 | Mercedes | C 180 | 1994 | Sedan | Gümüşü | 1.8 L / 122 a.g. / Benzin | 596 890 km | Avtomat (AT) | Arxa | Xeyr | 5 | Vuruğu yoxdur, rənglənib | Amerika | NaN | NaN | 1.8 | 122 a.g. | Benzin |
| 2 | Ağdaş | 2500 | LADA (VAZ) | 2108 | 1989 | Hetçbek, 3 qapı | Sarı | 1.5 L / 78 a.g. / Benzin | 172 820 km | Mexaniki (MT) | Ön | Xeyr | 5 | Vuruğu var, rənglənib | NaN | NaN | NaN | 1.5 | 78 a.g. | Benzin |
| 3 | Sumqayıt | 28900 | BMW | 528 | 2012 | Sedan | Ağ | 2.0 L / 245 a.g. / Benzin | 152 410 km | Avtomat (AT) | Arxa | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Amerika | NaN | NaN | 2.0 | 245 a.g. | Benzin |
| 4 | Bakı | 21500 | Hyundai | ix35 | 2012 | Offroader / SUV, 5 qapı | Ağ | 2.0 L / 150 a.g. / Benzin | 270 000 km | Avtomat (AT) | Ön | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Rəsmi diler | NaN | NaN | 2.0 | 150 a.g. | Benzin |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20073 | Bakı | 14300 | Mercedes | C 250 | 2008 | Sedan | Ağ | 1.8 L / 204 a.g. / Benzin | 248 850 km | Avtomat (AT) | Arxa | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Amerika | NaN | NaN | 1.8 | 204 a.g. | Benzin |
| 20074 | Bakı | 19800 | Ford | Fusion (North America) | 2017 | Offroader / SUV, 5 qapı | Yaş Asfalt | 1.5 L / 184 a.g. / Benzin | 142 000 km | Avtomat (AT) | Ön | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Digər | NaN | NaN | 1.5 | 184 a.g. | Benzin |
| 20075 | Bakı | 34000 | Hyundai | Sonata | 2020 | Sedan | Qara | 1.6 L / 183 a.g. / Benzin | 70 000 km | Avtomat (AT) | Ön | Xeyr | 5 | Vuruğu yoxdur, rənglənib | Amerika | NaN | NaN | 1.6 | 183 a.g. | Benzin |
| 20076 | Bakı | 86360 | Hyundai | Grandeur | 2023 | Sedan | Qara | 1.6 L / 230 a.g. / Hibrid | 25 000 km | Avtomat (AT) | Ön | Xeyr | NaN | Vuruğu yoxdur, rənglənməyib | NaN | NaN | NaN | 1.6 | 230 a.g. | Hibrid |
| 20077 | Bakı | 18500 | Chevrolet | Malibu | 2017 | Sedan | Ağ | 1.5 L / 160 a.g. / Benzin | 179 264 km | Avtomat (AT) | Ön | Xeyr | 5 | Vuruğu yoxdur, rənglənməyib | Amerika | NaN | NaN | 1.5 | 160 a.g. | Benzin |
20078 rows × 20 columns
df[['Kilometrage','Engine_Power']] = df[['Kilometrage','Engine_Power']].apply(
lambda x: x.str.replace(r'[^0-9]','',regex = True).astype(int)
)
df['Number_of_Seats'] = pd.to_numeric(df['Number_of_Seats'], errors = 'coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20078 entries, 0 to 20077 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 20078 non-null object 1 Price 20078 non-null int32 2 Make 20078 non-null object 3 Model 20078 non-null object 4 Year 20078 non-null int64 5 Body_Type 20078 non-null object 6 Color 20078 non-null object 7 Engine 20078 non-null object 8 Kilometrage 20078 non-null int32 9 Transmission 20078 non-null object 10 Drivetrain 20078 non-null object 11 Is_New 20078 non-null object 12 Number_of_Seats 17700 non-null float64 13 Condition 20077 non-null object 14 Market 16334 non-null object 15 Damaged 103 non-null object 16 Owners 8014 non-null object 17 Engine(L) 20078 non-null float64 18 Engine_Power 20078 non-null int32 19 Fuel_Type 20078 non-null object dtypes: float64(2), int32(3), int64(1), object(14) memory usage: 2.8+ MB
Now we see that Price , Engine(L), Kilometrage, Engine_Power(a.g), Number_of_Seats columns are in numeric data type.
Dealing with missing values¶
df.isnull().sum()
City 0 Price 0 Make 0 Model 0 Year 0 Body_Type 0 Color 0 Engine 0 Kilometrage 0 Transmission 0 Drivetrain 0 Is_New 0 Number_of_Seats 2378 Condition 1 Market 3744 Damaged 19975 Owners 12064 Engine(L) 0 Engine_Power 0 Fuel_Type 0 dtype: int64
null_cols = df.isnull().mean()
null_cols.loc[null_cols>0]
Number_of_Seats 0.118438 Condition 0.000050 Market 0.186473 Damaged 0.994870 Owners 0.600857 dtype: float64
There are null values in 5 columns. Let's go one by one.
Number_of_seatscolumn
df['Number_of_Seats'].isnull().sum()
2378
df_train = df.dropna(subset = ['Number_of_Seats']).copy()
df_predict = df.loc[df['Number_of_Seats'].isnull()].copy()
X_train = df_train[['Make', 'Model', 'Body_Type']]
y_train = df_train['Number_of_Seats']
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_predict = df_predict[['Make', 'Model', 'Body_Type']]
X_predict_encoded = pd.get_dummies(X_predict, drop_first=True)
X_predict_encoded = X_predict_encoded.reindex(columns = X_train_encoded.columns,fill_value = 0) #ensuring consistency with thr trainig dataset
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_encoded,y_train)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
predicted_seats = model.predict(X_predict_encoded)
df.loc[df['Number_of_Seats'].isnull(),'Number_of_Seats'] = predicted_seats
Marketcolumn
df['Market'].isnull().sum()
3744
df['Market'].unique()
array([nan, 'Amerika', 'Rəsmi diler', 'Yaponiya', 'Avropa', 'Rusiya',
'Koreya', 'Çin', 'Digər', 'Dubay'], dtype=object)
df_market_train = df.dropna(subset = ['Market']).copy()
df_market_predict = df.loc[df['Market'].isnull()].copy()
X_train = df_market_train[['Make', 'Model']]
y_train = df_market_train['Market']
X_train_encoded = pd.get_dummies(X_train, drop_first = True)
#y_train_encoded = pd.get_dummies(y_train, drop_first = True)
X_predict = df_market_predict[['Make', 'Model']]
X_predict_encoded = pd.get_dummies(X_predict , drop_first = True)
X_predict_encoded = X_predict_encoded.reindex(columns = X_train_encoded.columns,fill_value = 0)
model = RandomForestClassifier(n_estimators=100,random_state = 42)
model.fit(X_train_encoded,y_train)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
predicted_markets = model.predict(X_predict_encoded)
df.loc[df['Market'].isnull(),'Market'] = predicted_markets
Conditioncolumn
df['Condition'].isnull().sum()
1
df.loc[df['Condition'].isnull()]
| City | Price | Make | Model | Year | Body_Type | Color | Engine | Kilometrage | Transmission | Drivetrain | Is_New | Number_of_Seats | Condition | Market | Damaged | Owners | Engine(L) | Engine_Power | Fuel_Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18224 | Bakı | 19500 | Ssang Yong | Korando | 2013 | Offroader / SUV, 5 qapı | Ağ | 2.0 L / 149 a.g. / Benzin | 196000 | Avtomat (AT) | Tam | Xeyr | 5.0 | NaN | Rəsmi diler | NaN | 2 | 2.0 | 149 | Benzin |
There is only 1 missing row. So , it is better to drop that row.
df = df.dropna(subset = 'Condition')
DamagedandOwnerscolumns
These 2 columns will be dropped because more than 50% is null
df['Damaged'].value_counts()
Damaged Bəli 103 Name: count, dtype: int64
damaged_cars = df.loc[df['Damaged'] == 'Bəli']
plt.figure(figsize=(10, 6))
damaged_cars['Make'].value_counts().plot(kind = 'bar')
<Axes: xlabel='Make'>
From the visual, we can see that most of the damaged cars for sale are Mercedes, Opel, Lada (VAZ), Hyundai and Ford.
df = df.drop(columns = ['Damaged','Owners'])
Removing Duplicates¶
df.duplicated().sum()
162
# dropping duplicates
df.drop_duplicates(inplace = True)
Univariate Analysis¶
df.describe()
| Price | Year | Kilometrage | Number_of_Seats | Engine(L) | Engine_Power | |
|---|---|---|---|---|---|---|
| count | 19915.000000 | 19915.000000 | 1.991500e+04 | 19915.000000 | 19915.000000 | 19915.000000 |
| mean | 32006.302234 | 2012.264524 | 1.856518e+05 | 5.030178 | 2.226406 | 185.939292 |
| std | 42656.455048 | 9.199316 | 1.549953e+05 | 0.905965 | 1.426442 | 106.016526 |
| min | 500.000000 | 1906.000000 | 0.000000e+00 | 1.000000 | 0.000000 | 3.000000 |
| 25% | 11850.000000 | 2007.000000 | 7.500000e+04 | 5.000000 | 1.500000 | 114.000000 |
| 50% | 21500.000000 | 2013.000000 | 1.700000e+05 | 5.000000 | 2.000000 | 164.000000 |
| 75% | 35500.000000 | 2019.000000 | 2.600000e+05 | 5.000000 | 2.400000 | 224.000000 |
| max | 777479.000000 | 2025.000000 | 1.635680e+06 | 7.000000 | 20.003000 | 1548.000000 |
Insights:
Price: The average price for a car in Azerbaijan is around 32 006 AZN. However the median price is 21 500 AZN, which is much lower than mean. It means data is skewed heavily to the right. Price range is very large, from 500 AZN to 777 479 AZN.
Year: The oldest car for sale on turbo.az is from 1906 and the newest is from 2025. The average car in this dataset is from the year 2012.
Engine(L): Average engine size is around 2.23 liters. There are also electrical cars listed with 0 L engines and car(s) with very large engine size up to 20 L.
Kilometrage: The mean kilometrage for cars are 185K km and highest one is 1.635 mln km. There also new cars as well with 0 kilometrage.
Engine_Power: The average horsepower is around 185. Most powerful car has a power of 1548 while the least power car has a power 3.
df[['Price','Kilometrage','Year','Engine_Power','Engine(L)']].hist(bins=10, figsize=(10, 6))
plt.show()
From the the distributions we see that many columns have positive skewness.
Price - we can say majority of the cars are in lower price and there are a few and very expensive cars.
Year - Year of the majority of the cars are new (from 2000) with a smaller number of very old cars.
Kilometrage, Engine_Power, Engine(L) - the same as price column, thye have right skewness. There are very few cars with high kilometrage, engine size and power.
df['Make'].value_counts().head(10).plot(kind = 'bar',figsize = (18,6))
plt.xlabel('Make')
plt.title('Top 10 Most Common Car Makes')
plt.show()
Top 3 car brands in sale on turbo.az are Mercedes, Hyundai and Toyata
regions = df['City'].value_counts()
more_cars_regions = regions[regions > 150]
more_cars_regions.plot(kind = 'pie', figsize = (6,6),fontsize = 12.5,
explode = [0.05]*len(more_cars_regions),
ylabel = '',
legend=True,
startangle= 160, autopct='%1.1f%%')
plt.title('Cars by Region with More Than 150',fontsize=15, color = '#006400')
plt.show()
Regions with most of the cars are Bakı, Sumqayıt, Gəncə,Lənkəran and Xırdalan
df.columns
Index(['City', 'Price', 'Make', 'Model', 'Year', 'Body_Type', 'Color',
'Engine', 'Kilometrage', 'Transmission', 'Drivetrain', 'Is_New',
'Number_of_Seats', 'Condition', 'Market', 'Engine(L)', 'Engine_Power',
'Fuel_Type'],
dtype='object')
plt.subplot(1,2,1)
df['Body_Type'].value_counts().plot(kind = 'bar', figsize = (18,6),fontsize = 12.5,color = '#737373')
plt.title('Cars by Body Type',fontsize=15)
plt.subplot(1,2,2)
df['Color'].value_counts().plot(kind = 'bar', figsize = (18,6),fontsize = 12.5,color = '#08659f')
plt.title('Cars by Color', fontsize=15)
plt.show()
The following inferences can be made from the above bar plots:
Body Type of most of the cars are 'Sedan', 'Offroader / SUV', 'Hetchbek', 'Liftbek' and 'Universal'
And color of most of the cars are 'Ağ', 'Qara', 'Gümüşü', 'Yaş Asfalt'and 'Göy'
plt.subplot(1,2,1)
df['Fuel_Type'].value_counts().plot(kind = 'bar', figsize = (18,6),fontsize = 12.5,color = '#737373')
plt.title('Cars by Fuel Type',fontsize=15)
plt.subplot(1,2,2)
df['Transmission'].value_counts().plot(kind = 'bar', figsize = (18,6),fontsize = 12.5,color = '#08659f')
plt.title('Cars by Transmission', fontsize=15)
plt.show()
The following inferences can be made from the above bar plots:
Body Type of most of the cars are 'Sedan', 'Offroader / SUV', 'Hetchbek', 'Liftbek' and 'Universal'
And color of most of the cars are 'Ağ', 'Qara', 'Gümüşü', 'Yaş Asfalt'and 'Göy'
Bivariate Analysis¶
plt.figure(figsize = (15,6))
sns.boxplot(data = df, x = 'Fuel_Type', y = 'Price')
plt.title('Price Distribution by Fuel Type',fontsize=15)
plt.show()
- Median price for Electric and dizel-hibrid cars are higher than others. So, it means these cars are usually more expensive than others.
- The box for Benzin and Qaz categories are smaller than others. It means for these categories 50% of data concentrated among small price range (difference between min and max price) is higher.
- There are outliers across almost all types. It mean in each category there are very expensive cars for all fuel types.
- Benzin fuel type has the highest range.
Geospatial Analysis¶
az_regions_gdf = gpd.read_file('azrbaycan_rayonlar.geojson')
az_regions_gdf.head(2)
| OBJECTID | Name_AZ | Name | PARENT_ID | Code_OSM | Name_RU | Area | Shape_Leng | Shape_Le_1 | ID | Shape_Le_2 | Shape_Length | Shape_Area | Upper_case | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Qax | Gakh | 1 | 1204 | Гах | 1487.563667 | 3.145374 | 3.145374 | 403.0 | 3.145374 | 3.145374 | 0.159953 | QAX | POLYGON ((47.01821 41.5551, 47.01816 41.55467,... |
| 1 | 2 | Zaqatala | Zagatala | 1 | 1204 | Закатала | 1354.885693 | 2.853307 | 2.853307 | 402.0 | 2.853307 | 2.853307 | 0.146303 | ZAQATALA | MULTIPOLYGON (((46.47881 41.64208, 46.47853 41... |
Data source: https://opendata.az/
az_regions_gdf.loc[az_regions_gdf.Name_AZ.isin(['Nərimanov', 'Nizami', 'Pirallahı','Qaradağ', 'Sabunçu', 'Səbail', 'Suraxanı', 'Xətai', 'Xəzər',
'Yasamal', 'Binəqədi', 'Nəsimi']),'Name_AZ'] = 'Bakı'
df_agg = df.groupby('City').agg(
count_car = ('Price','count'),
avg_price = ('Price','mean')
).reset_index()
df_agg['count_car'] = df_agg['count_car'].round(0).astype(int)
df_agg['avg_price'] = df_agg['avg_price'].round(2)
df_agg.loc[df_agg.City == 'Xırdalan','City'] = 'Abşeron'
choropleth_gdf = az_regions_gdf.merge(df_agg, how = 'left',left_on = 'Name_AZ', right_on = 'City')
m = folium.Map(location=[40.1, 47.5], zoom_start=7)
# Create the interactive choropleth layer
folium.Choropleth(
geo_data=choropleth_gdf,
data=choropleth_gdf,
columns=['Name', 'count_car'],
key_on='feature.properties.Name',
fill_color='plasma',
fill_opacity=0.7,
line_opacity=0.2,
legend_name='Average Car Price (AZN)'
).add_to(m)
# Add tooltips for interaction
style_function = lambda x: {'fillColor': '#ffffff', 'color':'#000000', 'fillOpacity': 0.1, 'weight': 0.1}
highlight_function = lambda x: {'fillColor': '#000000', 'color':'#000000', 'fillOpacity': 0.50, 'weight': 0.1}
NIL = folium.features.GeoJson(
choropleth_gdf,
style_function=style_function,
control=False,
highlight_function=highlight_function,
tooltip=folium.features.GeoJsonTooltip(
fields=['Name', 'avg_price', 'count_car'],
aliases=['City:', 'Avg Price:', 'Total Cars:'],
style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
)
)
m.add_child(NIL)
m.save("car_price_map.html")
m